set.seed(12071999)
library(httr)
library(jsonlite)
library(modelr)
library(parsnip)
library(patchwork)
library(recipes)
library(scales)
library(sf)
library(stringr)
library(tidycensus)
library(tidymodels)
library(tidyr)
library(tidyverse)
library(tigris)
library(vip)
library(yardstick)Predicting the 2024 Presidential Election County Level Results
Intro to Data Science Final Project
Data cleaning & structuring
2020/2016 data
Download 2020/2016 election results data & merge into single file
Data on 2016 and 2020 U.S. presidential elections results come from this public GitHub repository, which compiles results from reputable sources, including Politico and the New York Times. This data includes presidential election vote margins on a county-level (our outcome variable). W
elections2016 <- read_csv("data/2016_US_County_Level_Presidential_Results.csv")
elections2020 <- read_csv("data/2020_US_County_Level_Presidential_Results.csv")elections2020 <- elections2020 |>
mutate(county_fips = as.character(str_remove(county_fips, "^0")))
elections2016 <- elections2016 |>
mutate(county_fips = as.character(combined_fips)) |>
select(-combined_fips, -diff) |>
mutate(diff = (votes_gop - votes_dem))
elections2020_2016 <- left_join(elections2020,
elections2016,
by = "county_fips")
colnames(elections2020_2016) <- gsub("\\.x$", "_current", colnames(elections2020_2016))
colnames(elections2020_2016) <- gsub("\\.y$", "_previous", colnames(elections2020_2016))
elections2020_2016 <- elections2020_2016 |>
mutate(county_name = county_name_current) |>
select(-county_name_previous, -county_name_current)Download county size data
county_size <- read_csv("data/LND01.csv") |>
select(STCOU,
LND010190D) |>
rename(geoid = STCOU,
land_area = LND010190D) |>
mutate(geoid = sub("^0+", "", geoid),
land_area = if_else(geoid == 8014, 32.97, land_area)) # manually inputting land area value for Broomfield County, Colorado, since it was incorrectly listed as zero.
# manually input data for Connecticut county-equivalents, as Census switched from using counties to Connecticut's Councils of Government divisions in 2022.
ct_county_eq_size <- data.frame(
geoid = c(9110, 9120, 9130, 9140, 9150, 9160, 9170, 9180, 9190),
land_area = c(1027.3, 140.2, 424.1, 412.8, 553.9, 786.6, 367.2, 598.1, 532.1) # manually inputting rows for Connnecticut post-2022 resturcturing of counties and county-equivalent entities
) |>
mutate(geoid = as.character(geoid))
county_size <- county_size |>
bind_rows(ct_county_eq_size)Download county and state geospatial data
counties_geospatial <- counties(cb = TRUE) |>
mutate(GEOID = sub("^0", "", GEOID)) |>
rename(county_fips = GEOID) |>
select(county_fips, geometry)
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|========================================================== | 84%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
state_geospatial <- states(cb = TRUE) |>
rename(state_name = NAME) |>
select(state_name, geometry)
|
| | 0%
|
|= | 1%
|
|== | 3%
|
|=== | 4%
|
|==== | 6%
|
|===== | 6%
|
|===== | 8%
|
|====== | 8%
|
|======= | 10%
|
|======== | 12%
|
|========= | 12%
|
|========== | 14%
|
|=========== | 15%
|
|============ | 17%
|
|============= | 19%
|
|============== | 21%
|
|=============== | 22%
|
|================ | 24%
|
|================== | 25%
|
|=================== | 27%
|
|===================== | 29%
|
|====================== | 31%
|
|======================= | 32%
|
|======================== | 34%
|
|========================= | 36%
|
|========================== | 37%
|
|============================ | 40%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 45%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 51%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================= | 59%
|
|========================================== | 59%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|================================================ | 69%
|
|================================================= | 70%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|=============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================= | 92%
|
|================================================================== | 94%
|
|=================================================================== | 95%
|
|==================================================================== | 97%
|
|===================================================================== | 99%
|
|======================================================================| 100%
Download 2019 predictor data via API
We use 2019 predictor data rather than 2020 due to ACS 1-year estimate availability. Many demographic variables (racial compositions, age distribution, etc.) do not change significantly year-on-year, so 2019 data should suffice for our purposes.
Why did we choose these predictors? According to a study conducted by Kulachi et al. (2023), voting behavior is very dynamic. People’s voting patterns depend on a culmination of numerous factors. Some factors shown to impact voting behavior that are included in this analysis are economic, gender, ethnicity and race, and age variables. Thus, we pulled predictors within these groupings. Other factors that have been shown to influence voting behaviors, like health care experiences and media influences, are difficult to estimate at an individual level. Thus, these are much more difficult to estimate at a county level. Due to the complexities in estimating these variables and limited data access, we do not include these variables in our analysis.
# download county demographic data
predictors2019 <- get_acs(dataset = "acs5",
year = 2019,
geography = "county",
variables = c(
# educational attainment
count18to24 = "S1501_C01_001",
count24to34 = "S1501_C01_016",
count35to44 = "S1501_C01_019",
count45to64 = "S1501_C01_022",
count65over = "S1501_C01_025",
countlessthanhs = "S1501_C01_002",
counthsgrad = "S1501_C01_003",
countsomecollegeassociates = "S1501_C01_004",
countbachhigher = "S1501_C01_005",
# total population
totalpopulation = "B01003_001",
# demographic information
maleratioper100females = "DP05_0004",
medianage = "DP05_0018",
countwhite = "DP05_0037",
countblack = "DP05_0038",
counthispanic = "DP05_0071",
# income
medianincome = "S1901_C01_012",
medianhhincome = "S1901_C02_012",
countbelowpoverty = "S1701_C02_001",
medianhousingcosts = "S2503_C01_024",
gini = "B19083_001",
# employment
countlaborforce16plus = "DP03_0002",
countunemployedinlaborforce16plus = "DP03_0005",
# foreign born
countforeignborncitizen = "B05002_013",
countforeignbornundocumented = "B05002_021")) |>
select(!moe) |>
pivot_wider(names_from = variable,
values_from = estimate)
# We convert our predictors to proportions
predictors2019 <- predictors2019 |>
mutate(count18over = count18to24 + count24to34 + count35to44 + count45to64 + count65over,
prop_less_than_hs = countlessthanhs / count18to24,
prop_hs_grad = counthsgrad / count18to24,
prop_some_college_associates = countsomecollegeassociates / count18to24,
prop_bachelors_higher = countbachhigher / count18to24,
prop_18_to_24 = count18to24 / totalpopulation,
prop_65_years_older = count65over / totalpopulation,
prop_white = countwhite / totalpopulation,
prop_black = countblack / totalpopulation,
prop_hispanic = counthispanic / totalpopulation,
poverty_rate = countbelowpoverty / totalpopulation,
unemployment_rate = countunemployedinlaborforce16plus / countlaborforce16plus,
prop_foreign_born_citizen = countforeignborncitizen / totalpopulation,
prop_undocumented = countforeignbornundocumented / totalpopulation,
year = 2019) |>
rename(male_ratio_per_100_females = maleratioper100females,
median_age = medianage,
median_income = medianincome,
median_housing_costs = medianhousingcosts,
total_population = totalpopulation,
geoid = GEOID,
name = NAME) |>
select(geoid,
name,
total_population,
prop_less_than_hs,
prop_hs_grad,
prop_some_college_associates,
prop_bachelors_higher,
prop_18_to_24,
prop_65_years_older,
prop_white,
prop_black,
prop_hispanic,
poverty_rate,
unemployment_rate,
male_ratio_per_100_females,
median_age,
median_income,
gini,
median_housing_costs,
prop_foreign_born_citizen,
prop_undocumented) |>
mutate(geoid = sub("^0+", "", geoid))
# merging county size predictors and calculating population density
predictors2019 <- left_join(x = predictors2019,
y = county_size,
by = "geoid") |>
mutate(land_area = as.numeric(land_area),
population_density = total_population / land_area)Merge elections & predictor dataframes
predictors2019 <- predictors2019 |>
mutate(county_fips = geoid) |>
select(-name, -geoid)
finaldata2020 <- left_join(x = elections2020_2016,
y = predictors2019,
by = "county_fips")
finaldata2020 <- finaldata2020 |> # reordering columns
select(state_name,
county_name,
county_fips,
total_votes_current,
votes_gop_current,
votes_dem_current,
diff_current,
per_gop_current,
per_dem_current,
per_point_diff_current,
total_votes_previous,
votes_gop_previous,
votes_dem_previous,
diff_previous,
per_point_diff_previous,
land_area,
total_population,
population_density,
prop_less_than_hs,
prop_hs_grad,
prop_some_college_associates,
prop_bachelors_higher,
prop_18_to_24,
prop_65_years_older,
prop_white,
prop_black,
prop_hispanic,
prop_foreign_born_citizen,
prop_undocumented,
poverty_rate,
unemployment_rate,
gini,
median_age,
median_income,
median_housing_costs,
male_ratio_per_100_females)Create predictor: swing county
This predictor will indicate whether or not the county switched from voting for the Republican candidate to the Democratic candidate, or vice versa, between 2012 and 2016. We anticipate that whether or not the county has been a swing county historically will help to predict 2020 margins. The relationship between 2016 margin size (another predictor) and 2020 margin size (our outcome variable) will likely be weaker for swing counties, as these counties are more likely to switch their party preference. We will include an interaction variable between the swing predictor and the previous election margin predictor to account for this relationship.
elections2016_2012 <- read_csv("data/US_County_Level_Presidential_Results_12-16.csv")
margins2012 <- elections2016_2012 |>
mutate(diff2012 = votes_gop_2012 - votes_dem_2012,
county_fips = as.character(combined_fips)) |>
select(county_fips, diff2012)
finaldata2020 <- left_join(x = finaldata2020,
y = margins2012,
by = "county_fips") |>
mutate(swing = case_when(
(diff_previous > 0 & diff2012 < 0) ~ 1,
(diff_previous < 0 & diff2012 > 0) ~ 1,
TRUE ~ 0))Rectify missing data
The only observations in our dataset with missing variables are observations in the state of Alaska. The elections dataset divides Alaska into its 40 state-level congressional districts, but our predictor dataset divides Alaska by its 30 boroughs and census areas (Alaskan county-equivalents). We are thus removing all Alaskan observations from our dataset, limiting the external validity of our model.
There is also a single observation in South Dakota that is missing data. We drop this variable rather than impute for the sake of time, since a single observation should not significantly impact results.
finaldata2020 |>
filter(if_any(everything(), is.na))# A tibble: 41 × 38
state_name county_name county_fips total_votes_current votes_gop_current
<chr> <chr> <chr> <dbl> <dbl>
1 Alaska District 1 2901 7360 3511
2 Alaska District 2 2902 6161 3674
3 Alaska District 3 2903 8385 6076
4 Alaska District 4 2904 10587 4690
5 Alaska District 5 2905 8706 4077
6 Alaska District 6 2906 9518 5770
7 Alaska District 7 2907 9664 7027
8 Alaska District 8 2908 9957 7618
9 Alaska District 9 2909 11047 7787
10 Alaska District 10 2910 11256 8081
# ℹ 31 more rows
# ℹ 33 more variables: votes_dem_current <dbl>, diff_current <dbl>,
# per_gop_current <dbl>, per_dem_current <dbl>, per_point_diff_current <dbl>,
# total_votes_previous <dbl>, votes_gop_previous <dbl>,
# votes_dem_previous <dbl>, diff_previous <dbl>,
# per_point_diff_previous <chr>, land_area <dbl>, total_population <dbl>,
# population_density <dbl>, prop_less_than_hs <dbl>, prop_hs_grad <dbl>, …
finaldata2020 <- finaldata2020 |>
filter(state_name != "Alaska") |>
filter(county_fips != 46102)
finaldata2020 |>
filter(if_any(everything(), is.na)) ## confirmed no missing data# A tibble: 0 × 38
# ℹ 38 variables: state_name <chr>, county_name <chr>, county_fips <chr>,
# total_votes_current <dbl>, votes_gop_current <dbl>,
# votes_dem_current <dbl>, diff_current <dbl>, per_gop_current <dbl>,
# per_dem_current <dbl>, per_point_diff_current <dbl>,
# total_votes_previous <dbl>, votes_gop_previous <dbl>,
# votes_dem_previous <dbl>, diff_previous <dbl>,
# per_point_diff_previous <chr>, land_area <dbl>, total_population <dbl>, …
Checking for missing land_area data
finaldata2020 |>
summarize(min_land_area = min(land_area)) # A tibble: 1 × 1
min_land_area
<dbl>
1 1.83
land_missing <- finaldata2020 |>
filter(land_area == 0)
land_missing_unique <- unique(land_missing[, c("state_name", "county_name")])
print(land_missing_unique) # no more counties with missing land area. # A tibble: 0 × 2
# ℹ 2 variables: state_name <chr>, county_name <chr>
2024/2020 data
Download 2024 election results data
elections2024 <- read_csv("data/2024_US_County_Level_Presidential_Results.csv")Rolling up DC elections data into a single row
DC data is presented by ward in the elections dataset, but not the 2022 predictors dataset. We roll the elections data into a single observation to match our predictor data.
dc_data <- elections2024 |>
filter(state_name == "District of Columbia") |>
mutate(
weighted_percent_votes_gop = per_gop * total_votes,
weighted_percent_votes_dem = per_dem * total_votes,
weighted_percent_points_diff = per_point_diff * total_votes
) |>
summarize(
county_fips = 11001,
votes_gop = sum(votes_gop),
votes_dem = sum(votes_dem),
total_votes = sum(total_votes),
diff = sum(diff),
per_gop = sum(weighted_percent_votes_gop) / sum(total_votes),
per_dem = sum(weighted_percent_votes_dem) / sum(total_votes),
per_point_diff = sum(weighted_percent_points_diff) / sum(total_votes),
state_name = "District of Columbia",
county_name = "District of Columbia"
) |>
mutate(county_fips = as.character(county_fips))Merge 2024/2020 elections data into single file
elections2024 <- elections2024 |>
filter(!county_fips %in% c(11001, 11002, 11003, 11004, 11005, 11006, 11007, 11008)) |> # dc data inputted manually from census bureau information
bind_rows(dc_data)
# rest of merge
elections2024 <- elections2024 |>
mutate(county_fips = as.character(str_remove(county_fips, "^0")))
elections2024_2020 <- left_join(elections2024,
elections2020,
by = "county_fips")
colnames(elections2024_2020) <- gsub("\\.x$", "_current", colnames(elections2024_2020))
colnames(elections2024_2020) <- gsub("\\.y$", "_previous", colnames(elections2024_2020))
elections2024_2020 <- elections2024_2020 |>
mutate(county_name = county_name_current) |>
select(-county_name_previous, -county_name_current)
elections2024_2020 <- elections2024_2020 |>
mutate(state_name = state_name_current) |>
select(-state_name_previous, -state_name_current)Download 2022 predictor data via API
Similar to our 2020 dataset, we use 2022 predictor data rather than 2024 due to ACS 1-year estimate availability. Many demographic variables (racial compositions, age distribution, etc.) do not change significantly year-on-year, so 2022 data should suffice for our purposes.
# download county demographic data
predictors2022 <- get_acs(dataset = "acs5",
year = 2022,
geography = "county",
variables = c(
# educational attainment
count18to24 = "S1501_C01_001",
count24to34 = "S1501_C01_016",
count35to44 = "S1501_C01_019",
count45to64 = "S1501_C01_022",
count65over = "S1501_C01_025",
countlessthanhs = "S1501_C01_002",
counthsgrad = "S1501_C01_003",
countsomecollegeassociates = "S1501_C01_004",
countbachhigher = "S1501_C01_005",
# total population
totalpopulation = "B01003_001",
# demographic information
maleratioper100females = "DP05_0004",
medianage = "DP05_0018",
countwhite = "DP05_0037",
countblack = "DP05_0038",
counthispanic = "DP05_0071",
# income
medianincome = "S1901_C01_012",
medianhhincome = "S1901_C02_012",
countbelowpoverty = "S1701_C02_001",
medianhousingcosts = "S2503_C01_024",
gini = "B19083_001",
# employment
countlaborforce16plus = "DP03_0002",
countunemployedinlaborforce16plus = "DP03_0005",
# foreign born
countforeignborncitizen = "B05002_013",
countforeignbornundocumented = "B05002_021")) |>
select(!moe) |>
pivot_wider(names_from = variable,
values_from = estimate)
predictors2022 <- predictors2022 |>
mutate(count18over = count18to24 + count24to34 + count35to44 + count45to64 + count65over,
prop_less_than_hs = countlessthanhs / count18to24,
prop_hs_grad = counthsgrad / count18to24,
prop_some_college_associates = countsomecollegeassociates / count18to24,
prop_bachelors_higher = countbachhigher / count18to24,
prop_18_to_24 = count18to24 / totalpopulation,
prop_65_years_older = count65over / totalpopulation,
prop_white = countwhite / totalpopulation,
prop_black = countblack / totalpopulation,
prop_hispanic = counthispanic / totalpopulation,
poverty_rate = countbelowpoverty / totalpopulation,
unemployment_rate = countunemployedinlaborforce16plus / countlaborforce16plus,
prop_foreign_born_citizen = countforeignborncitizen / totalpopulation,
prop_undocumented = countforeignbornundocumented / totalpopulation,
year = 2022) |>
rename(male_ratio_per_100_females = maleratioper100females,
median_age = medianage,
median_income = medianincome,
median_housing_costs = medianhousingcosts,
total_population = totalpopulation,
geoid = GEOID,
name = NAME) |>
select(geoid,
name,
total_population,
prop_less_than_hs,
prop_hs_grad,
prop_some_college_associates,
prop_bachelors_higher,
prop_18_to_24,
prop_65_years_older,
prop_white,
prop_black,
prop_hispanic,
poverty_rate,
unemployment_rate,
male_ratio_per_100_females,
median_age,
median_income,
gini,
median_housing_costs,
prop_foreign_born_citizen,
prop_undocumented) |>
mutate(geoid = sub("^0+", "", geoid))
# merging county size predictors and calculating population density
predictors2022 <- left_join(x = predictors2022,
y = county_size,
by = "geoid") |>
mutate(land_area = as.numeric(land_area),
population_density = total_population / land_area)Merge elections & predictor dataframes
predictors2022 <- predictors2022 |>
mutate(county_fips = geoid) |>
select(-name, -geoid)
finaldata2024 <- left_join(x = elections2024_2020,
y = predictors2022,
by = "county_fips") Create predictor: swing county
This predictor will indicate whether or not the county switched from voting for the Republican candidate to the Democratic candidate, or vice versa, between 2016 and 2020. See prior justification.
margins2016 <- finaldata2020 |>
mutate(diff2016 = diff_previous) |>
select(county_fips, diff2016)
finaldata2024 <- left_join(x = finaldata2024,
y = margins2016,
by = "county_fips") |>
mutate(swing = case_when(
(diff_previous > 0 & diff2016 < 0) ~ 1,
(diff_previous < 0 & diff2016 > 0) ~ 1,
TRUE ~ 0))Rectify missing data
As in the final 2020 data, we remove Alaskan observations and the singular South Dakota observation, for the same reasons we did in the 2020 data.
We also drop Kenedy and Loving Counties, both in Texas, as they are two of the smallest counties in the U.S. with a permanent population. Observations are missing median income and housing cost data.
Beginning in 2022, Connecticut switched from using counties to using Councils of Government for statistical reporting purposes. The new county-equivalents do not match the previous boundaries of counties. Connecticut is thus missing 2020 elections results data, which will be rectified via knn imputation in our recipe.
finaldata2024 |>
filter(if_any(everything(), is.na))# A tibble: 52 × 40
county_fips votes_gop_current votes_dem_current total_votes_current
<chr> <dbl> <dbl> <dbl>
1 2001 4859 3364 8223
2 2002 4533 4569 9102
3 2003 4495 6130 10625
4 2004 2690 6160 8850
5 2005 4351 3325 7676
6 2006 6700 4772 11472
7 2007 6775 2513 9288
8 2008 7949 2575 10524
9 2009 5575 5846 11421
10 2010 4354 4068 8422
# ℹ 42 more rows
# ℹ 36 more variables: diff_current <dbl>, per_gop_current <dbl>,
# per_dem_current <dbl>, per_point_diff_current <dbl>,
# votes_gop_previous <dbl>, votes_dem_previous <dbl>,
# total_votes_previous <dbl>, diff_previous <dbl>, per_gop_previous <dbl>,
# per_dem_previous <dbl>, per_point_diff_previous <dbl>, county_name <chr>,
# state_name <chr>, total_population <dbl>, prop_less_than_hs <dbl>, …
finaldata2024 <- finaldata2024 |>
filter(state_name != "Alaska") |>
filter(county_fips != 46102 &
county_fips != 48261 &
county_fips != 48301)
finaldata2024 |>
filter(if_any(everything(), is.na)) ## confirmed no missing data outside of Connecticut 2020 voting data# A tibble: 9 × 40
county_fips votes_gop_current votes_dem_current total_votes_current
<chr> <dbl> <dbl> <dbl>
1 9110 181038 285105 474268
2 9120 52291 83719 138055
3 9130 44318 58360 104480
4 9140 115290 99237 217520
5 9150 29028 21165 50962
6 9160 31961 31145 64101
7 9170 105522 159331 270444
8 9180 58392 76146 136954
9 9190 121477 180079 306335
# ℹ 36 more variables: diff_current <dbl>, per_gop_current <dbl>,
# per_dem_current <dbl>, per_point_diff_current <dbl>,
# votes_gop_previous <dbl>, votes_dem_previous <dbl>,
# total_votes_previous <dbl>, diff_previous <dbl>, per_gop_previous <dbl>,
# per_dem_previous <dbl>, per_point_diff_previous <dbl>, county_name <chr>,
# state_name <chr>, total_population <dbl>, prop_less_than_hs <dbl>,
# prop_hs_grad <dbl>, prop_some_college_associates <dbl>, …
finaldata2024 <- finaldata2024 |> # reordering columns
select(state_name,
county_name,
county_fips,
total_votes_current,
votes_gop_current,
votes_dem_current,
diff_current,
per_gop_current,
per_dem_current,
per_point_diff_current,
total_votes_previous,
votes_gop_previous,
votes_dem_previous,
diff_previous,
per_point_diff_previous,
land_area,
total_population,
population_density,
prop_less_than_hs,
prop_hs_grad,
prop_some_college_associates,
prop_bachelors_higher,
prop_18_to_24,
prop_65_years_older,
prop_white,
prop_black,
prop_hispanic,
prop_foreign_born_citizen,
prop_undocumented,
poverty_rate,
unemployment_rate,
gini,
median_age,
median_income,
median_housing_costs,
male_ratio_per_100_females,
swing)Set up testing environment using 2020 data
Initial split
set.seed(12071999)
modeling_sample <- initial_split(finaldata2020)
train <- training(modeling_sample)
test <- testing(modeling_sample)Exploratory analysis
# us map of vote outcomes by party
countymap_winner <- train |>
left_join(counties_geospatial, by = "county_fips") |>
select(diff_current, geometry) |>
mutate(winner = if_else(diff_current > 0, "REP", "DEM")) |>
st_as_sf() |>
ggplot() +
geom_sf(aes(fill = winner)) +
scale_fill_manual(values = c(
"DEM" = "royalblue4",
"REP" = "firebrick3"),
name = "Winner"
)
# us map of white proportions by county
map_propwhite <- train |>
left_join(counties_geospatial, by = "county_fips") |>
select(county_fips, prop_white, geometry) |>
st_as_sf() |>
ggplot() +
geom_sf(aes(fill = prop_white)) +
scale_fill_gradient(
low = "white",
high = "chartreuse3"
)
countymap_winner + map_propwhite# us map of black proportions by county
map_propblack <- train |>
left_join(counties_geospatial, by = "county_fips") |>
select(county_fips, prop_black, geometry) |>
st_as_sf() |>
ggplot() +
geom_sf(aes(fill = prop_black)) +
scale_fill_gradient(
low = "white",
high = "royalblue3"
)
# us map of hispanic proportions by county
map_prophispanic <- train |>
left_join(counties_geospatial, by = "county_fips") |>
select(county_fips, prop_hispanic, geometry) |>
st_as_sf() |>
ggplot() +
geom_sf(aes(fill = prop_hispanic)) +
scale_fill_gradient(
low = "white",
high = "darkgoldenrod1"
)
map_propblack + map_prophispanicAdd explanation
options(sciepen = 999)
theme_set(theme_minimal())
train |>
mutate(winner = ifelse(diff_current < 0, "dem", "rep")) |>
ggplot(aes(x = diff_current, fill = winner)) +
geom_histogram(binwidth = 5000, color = "black") +
scale_x_continuous(labels = scales::number_format(scale = 1), limits = c(-200000, 200000)) +
scale_fill_manual(values = c("dem" = "blue", "rep" = "red")) +
labs(title = "Count of Party Wins by County Margin in 2020",
x = "Differential in Number of Votes",
y = "Count")train |>
mutate(pop_size_decile = ntile(total_population, 10)) |>
mutate(winner = ifelse(diff_current < 0, "dem", "rep")) |>
group_by(pop_size_decile) |>
summarise("dem wins" = sum(winner == "dem"),
"rep wins" = sum(winner == "rep"),
"share dem wins" = sum(winner == "dem") / n())# A tibble: 10 × 4
pop_size_decile `dem wins` `rep wins` `share dem wins`
<int> <int> <int> <dbl>
1 1 13 221 0.0556
2 2 25 209 0.107
3 3 29 205 0.124
4 4 23 210 0.0987
5 5 21 212 0.0901
6 6 28 205 0.120
7 7 28 205 0.120
8 8 32 201 0.137
9 9 53 180 0.227
10 10 160 73 0.687
Republicans won more counties than Democrats in the 2020 presidential election, but the Democratic tail of the distribution is much longer, indicating that large margins happened more often in counties that Democrats won than in counties that Republicans won. We know that a Republican candidate did not win in 2020. This implies that while the Republican candidate won more counties overall in 2020, Democrats disproportionately won counties with large populations. This is demonstrated in our table. In 2020, the Republican candidate won the significant majority of all counties with a below-90th percentile population size. However, the Democratic candidate won about 67% of the largest 10% of counties in the nation.
This plot also demonstrates why we decided not to model a binary outcome variable capturing whether a Republican won in each county, as such a model would be biased towards Republicans and would not be useful in informing national-level winners. Instead, we chose to use margins as our outcome variable, which capture the same information as a binary winner variable while also potentially proving more useful for national-level predictions.
# Exploring spread of values in margins variable in 2020 election
train |>
mutate(winner = ifelse(diff_current < 0, "dem", "rep")) |>
group_by(winner) |>
summarize("Max margin" = max(diff_current),
"Min margin" = min(diff_current),
"Mean margin" = mean(diff_current),
"SD margin"= sd(diff_current))# A tibble: 2 × 5
winner `Max margin` `Min margin` `Mean margin` `SD margin`
<chr> <dbl> <dbl> <dbl> <dbl>
1 dem -23 -1883355 -48231. 133067.
2 rep 119005 14 7191. 9324.
The average Democrat-won county in 2020 has a much larger margin (46,018 votes) than the average Republican-won county (6,920 votes). This is unsurprising, as Democrats are more likely to win large, urban counties. The distribution of the margin for Democrat-won counties is also much wider.
# Do patterns hold for the 2016 election
train |>
mutate(winner = ifelse(diff_previous < 0, "dem", "rep")) |>
group_by(winner) |>
summarize("Max margin" = max(diff_previous),
"Min margin" = min(diff_previous),
"Mean margin" = mean(diff_previous),
"SD margin"= sd(diff_previous))# A tibble: 2 × 5
winner `Max margin` `Min margin` `Mean margin` `SD margin`
<chr> <dbl> <dbl> <dbl> <dbl>
1 dem -15 -1273485 -40375. 109652.
2 rep 104444 2 6639. 8760.
These patterns holds for the 2016 election as well.
# Exploring these margin outliers for the democrats in 2020 presidential election
train |>
summarize(outlier_dem = sum(diff_previous < -104444))# A tibble: 1 × 1
outlier_dem
<int>
1 35
train |>
filter(diff_previous < -104444) |>
summarize(min_dem_outlier = abs(min(diff_previous)),
max_dem_outlier = abs(max(diff_previous)),
mean_dem_outlier = abs(mean(diff_previous)),
sd_dem_outleir = sd(diff_previous))# A tibble: 1 × 4
min_dem_outlier max_dem_outlier mean_dem_outlier sd_dem_outleir
<dbl> <dbl> <dbl> <dbl>
1 1273485 104746 280466. 244136.
In the 2016 election, 45 values may be extreme values in the analysis. We defined Democratic extreme values as any margin greater than the highest Republican margin. This is relatively close to the number of extreme values in the 2020 presidential election. Similarly, the standard deviation is relatively high as well. This may mean that the values are not close together, varying greatly. However, the mean is decently close to the minimum value, telling us that the extreme values with the largest magnitude may be pulling these values upwards, and the rest of the extreme values are closer to the -104746 value. Since the number of extreme values is decently similar between the 2016 and 2020 presidential election (53 and 45), this may not be of strong concern. These extreme values may also be of less concern since they make up such a small portion of the data set. The values we would be concerned about make up about 1.7%.
#What counties make up these extreme values in both the 2016 and 2020 elections?
outliers <- train |>
filter(diff_current < -119005 | diff_previous < -104444)
print(table(outliers$state_name))
California Colorado Connecticut
8 1 1
District of Columbia Florida Georgia
1 3 2
Illinois Louisiana Maryland
1 1 1
Massachusetts Michigan Minnesota
3 1 1
Missouri New Jersey New York
1 1 3
North Carolina Ohio Oregon
2 2 1
Pennsylvania Texas Virginia
2 2 1
Wisconsin
2
As expected, California repeats the most in the extreme values for democrat wins. Interestingly, some swing states show up in this. Since so few show up in this analysis, and many in places we would expect a democrat win, this may not be of great concern. However, it is interesting to note. Possibly, state could be interesting predictor for a place like California.
#Education distribution by County Winner
train |>
select(state_name, prop_less_than_hs, prop_hs_grad, prop_some_college_associates, prop_bachelors_higher, diff_current) |>
mutate(winner = ifelse(diff_current < 0, "Dem", "Rep")) |>
group_by(winner) |>
pivot_longer(cols = c(prop_less_than_hs, prop_hs_grad, prop_some_college_associates, prop_bachelors_higher),
names_to = "education_level", values_to = "proportion") |>
mutate(proportion_two = proportion / sum(proportion) * 100) |>
ggplot(aes(x = winner, y = proportion_two, fill = education_level)) +
geom_bar(stat = "identity") +
labs(title = "Education Distribution by County Winner in 2020", y = "Proportion (%)", x = "Winner", fill = "Level of Education") +
scale_fill_manual(values = c("prop_less_than_hs" = "#1f77b4",
"prop_hs_grad" = "#ff7f0e",
"prop_some_college_associates" = "#2ca02c",
"prop_bachelors_higher" = "#d62728"),
labels = c("prop_less_than_hs" = "Less Than High School",
"prop_hs_grad" = "High School Graduate",
"prop_some_college_associates" = "Some College or Associates Degree",
"prop_bachelors_higher" = "Bachelor's or Higher")) +
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank())This graph suggests that counties who voted Democratic in 2020 had a higher number of residents with some college education. We capture educational attainment in the model using a number of predictors, as the leftward political shift of college-educated individuals in recent decades has made educational patterns a potentially useful indicator for voting outcomes.
# Is there a relationship between education levels and who wins in the presidential election?
train |>
filter(diff_current > -100000) |> # filtering out the two outliers to better see data spread
ggplot() +
geom_point(aes(x = prop_less_than_hs, y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = prop_less_than_hs,
y = diff_current),
method = "lm",
se = FALSE,
color = "grey",
linetype = "dashed") +
labs(title = "Association between proportion of counties with \nless than high school educ. and margin",
x = "Proportion less than high school",
y = "Differential in number of votes")train |>
filter(diff_current > -100000) |> # filtering out the two outliers to better see data spread
ggplot() +
geom_point(aes(x = prop_hs_grad, y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = prop_hs_grad,
y = diff_current),
method = "lm",
se = FALSE,
color = "grey",
linetype = "dashed") +
labs(title = "Association between proportion of counties with \nhigh school educ. and margin",
x = "Proportion of high school graduates",
y = "Differential in number of votes")train |>
filter(diff_current > -100000) |> # filtering out the two outliers to better see data spread
ggplot() +
geom_point(aes(x = prop_some_college_associates, y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = prop_some_college_associates,
y = diff_current),
method = "lm",
se = FALSE,
color = "grey",
linetype = "dashed") +
labs(title = "Association between proportion of counties with \nsome college or associate's and margin",
x = "Proportion some college or associates degree",
y = "Differential in number of votes")train |>
filter(diff_current > -100000) |> # filtering out the two outliers to better see data spread
ggplot() +
geom_point(aes(x = prop_bachelors_higher, y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = prop_bachelors_higher,
y = diff_current),
method = "lm",
se = FALSE,
color = "grey",
linetype = "dashed") +
labs(title = "Association between proportion of counties with a \nbachelor's degree or higher and margin",
x = "Proportion bachelor's degree or higher",
y = "Differential in the number of votes")The proportion of people with bachelors degree or higher and its association with the margin winner is the strongest out of all education varibles, since it is the steepest line. Additionally, this association is most interesting, as it is also negative. Some college associates is also negative, although not as steep. Less than high school and margins winner is the flattest, telling us there is little relationship between these two, although it is positively associated. High school grad and margin have a moderately psotive relationship. This could mean that two key predictors for education in this model may be bachelors or higher and high school grad.
train |>
ggplot() +
geom_point(aes(x = population_density,
y = diff_current),
alpha = 0.5,
color = "pink") +
geom_smooth(mapping = aes(x = population_density,
y = diff_current),
method = "lm",
se = FALSE,
color = "grey",
linetype = "dashed") +
labs(
title = "Population Density and Margin in 2020",
x = "Population Density",
y = "Margin") An increase in the population density is associated with a decrease in the margin size. This means that an increase in population density is associated with an increase in likelihood of a Democratic win.
# Choosing Economic Variables
train |>
ggplot() +
geom_point(aes(x = unemployment_rate,
y = diff_current,
color = poverty_rate >= 0.2),
alpha = 0.3) +
geom_smooth(mapping = aes(x = unemployment_rate,
y = diff_current),
method = "lm",
se = FALSE,
color = "black",
linetype = "dashed") +
labs(title = "Unemployment Rate and Margin",
x = "Unemployment rate",
y = "Differential in the number of votes") +
scale_y_continuous(labels = scales::number_format(scale = 1), limits = c(-200000, 200000)) +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "green")) This graph shows that lower unemployment rates are associated with republican wins (Republican win = positive margin). Interestingly, higher unemployment rates are not necessarily associated with negative margins, or Democratic party wins. The summary line shows that while the association between the party win and unemployment rates is negative (an increase in unemployment rate is associated with a decrease in margins), it is barely negative. This means the association between who wins a presidential election and unemployment rate might not be helpful. As expected though, counties with higher unemployment rates are more likely to fall below the poverty line, which we defined as 20% (based on the USDA, Economic Research Service’s (ERS) Poverty Area Measures). It is also interesting to note that places with a high poverty rate seem to have a smaller margin, and do not necessarily fall into a democrat winner or a republican winner. That being said, counties with a poverty rate above twenty percent are more likely to vote for a democrat in the presidential election than those with poverty rate below twenty percent.
# Further poverty rate exploration
unemployment_graph_1 <- train |>
ggplot() +
geom_point(aes(x = unemployment_rate,
y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = unemployment_rate,
y = diff_current),
method = "lm",
se = FALSE,
color = "grey",
linetype = "dashed") +
geom_vline(xintercept = 0.1, linetype = "dotted", color = "blue") +
annotate("text", x = 0.11, y = max(finaldata2020$diff_previous, na.rm = TRUE),
label = "High Unemployment Rate, 10%", color = "blue", hjust = 0) +
labs(title = "Unemployment rate and margins in 2020 election",
subtitle = "Including extreme dem values",
x = "Unemployment rate",
y = "Differential in number of votes")
unemployment_graph_2 <- train |>
filter(diff_current > -100000) |> # filtering out the two outliers to better see data spread
ggplot() +
geom_point(aes(x = unemployment_rate,
y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = unemployment_rate,
y = diff_current),
method = "lm",
se = FALSE,
color = "grey",
linetype = "dashed") +
geom_vline(xintercept = 0.1, linetype = "dotted", color = "blue") +
annotate("text", x = 0.11, y = max(finaldata2020$diff_previous, na.rm = TRUE),
label = "High Unemployment Rate, 10%", color = "blue", hjust = 0) +
labs(title = "",
subtitle = "Excluding extreme dem values",
x = "Unemployment rate",
y = "")
unemployment_graph_1 + unemployment_graph_2In these graphs, we checked to see how unemployment rate might be associated with the margins variable in both the 2020, both with and without the outliers in the margins variable. In both graphs, there is little to no association between the two. The vertical line represents where high unemployment rate is, 10% (based on the Organization for Economic Co-operation and Development 2013 factbook). It does not appear that having a high unemployment rate would change the non-relationship between unemployment rate and margins variable. These graphs demonstrate that unemployment rate may not be a strong predictor. Although it looks like more counties with a high unemployment rate favored the democratic party compared to those with a low unemployment rate, it does not appear to be more by a significant amount.
# Choosing inequality estimators
train |>
mutate(winner = ifelse(diff_current < 0, "Dem", "Rep")) |>
group_by(winner) |>
ggplot() +
geom_point(aes(x = poverty_rate, y = gini, color = winner),
alpha = 0.3) +
geom_smooth(mapping = aes(x = poverty_rate,
y = gini,
group = winner,
color = winner),
method = "lm",
se = FALSE,
linetype = "dashed") +
scale_color_manual(values = c("Dem" = "blue", "Rep" = "red")) +
labs(title = "Poverty Rate vs. Gini Index by Winner",
x = "Poverty Rate",
y = "Gini Index")This graph shows that an increase in poverty rate is associated with an increase in the gini index, or income inequality, regardless of what party won the county. What is interesting is that for counties where a republican candidate won, the summary line starts at a lower point in the graph and ends higher. That is, the republican line is steeper than the democratic line. This could mean that for republican-winning counties, the relationship between poverty rate and gini index is more negatively associated then democrat winning counties. This graph looks like democrat winning counties fall higher on the gini index and poverty rate. Below, we dive into this relationship further.
# Exploring gini index and poverty rate relationship with winning party
train |>
ggplot() +
geom_point(aes(x = poverty_rate,
y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = poverty_rate,
y = diff_current)) +
labs(title = "Association between Poverty Rate and Margins Winner in 2020",
x = "Poverty rate",
y = "Differential in number of votes")train |>
ggplot() +
geom_point(aes(x = gini,
y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = poverty_rate,
y = diff_current)) +
labs(title = "Association between Gini Index and Margins Winner in 2020",
x = "Gini",
y = "Differential in number of votes")train |>
ggplot() +
geom_point(aes(x = median_housing_costs,
y = diff_current),
alpha = 0.3) +
geom_smooth(mapping = aes(x = median_housing_costs,
y = diff_current)) +
labs(title = "Association between Median Housing Cost and Margins Winner in 2020",
x = "Median monthly cost in housing",
y = "Differential in number of votes")Neither poverty rate or gini index on their own seem to be a significant predictor of who won the county in the 2020 presidential election. Since a previous graph shows that relationship between the gini index and poverty rate seems to have a more significant effect in counties where republicans won, this could mean an interaction of the gini and poverty variables may be a helpful predictor.
Median monthly housing costs, on the other hand, seems to be an effective predictor – there seems to be a clear relationship between higher median monthly housing costs and preference for the Democratic party.
# Distribution
train |>
select(state_name, prop_18_to_24, prop_65_years_older, diff_current) |>
mutate(winner = ifelse(diff_current < 0, "Dem", "Rep"),
prop_other = 1 - (prop_18_to_24 + prop_65_years_older)) |>
group_by(winner) |>
summarize(
avg_18_to_24 = mean(prop_18_to_24, na.rm = TRUE) * 100,
avg_65_years_older = mean(prop_65_years_older, na.rm = TRUE) * 100,
avg_other = mean(prop_other, na.rm = TRUE) * 100,
.groups = "drop"
) |>
pivot_longer(
cols = c(avg_18_to_24, avg_65_years_older, avg_other),
names_to = "age_group",
values_to = "avg_proportion"
) |>
ggplot(aes(x = winner, y = avg_proportion, fill = age_group)) +
geom_bar(stat = "identity") +
labs(
title = "Average Age Group Proportions by County Winner in 2020",
y = "Average Proportion (%)",
x = "Winner"
) +
scale_y_continuous(labels = scales::percent_format(scale = 1)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))As expected, the age group that seems to be different between counties where a democrat won versus a republican is the 18 to 24 category. It appears that younger people vote more for the democratic candidate than the republican candidate. As expected, the republican counties had a higher share of the older age population voting for them. This falls in line with our intuition and tells us its likely important to include both in our model.
train |>
select(county_fips, diff_current, prop_white, prop_black, prop_hispanic) |>
filter(diff_current > -1000000) |> # removing democratic outliers
pivot_longer(cols = c(
"prop_white",
"prop_black",
"prop_hispanic"),
names_to = "ethnicity",
values_to = "proportion"
) |>
ggplot() +
geom_point(mapping = aes(x = proportion,
y = diff_current,
color = ethnicity),
alpha = 0.3) +
geom_smooth(mapping = aes(x = proportion,
y = diff_current),
color = "darkgrey") +
scale_x_continuous(breaks = c(0,1)) +
facet_wrap(~ ethnicity) +
labs(title = "Vote differentials by ethnic proportions",
x = "Proportion of ethnicity",
y = "Differential in number of votes",
color = "Ethnicity")train |>
select(county_fips, diff_current, prop_white, prop_black, prop_hispanic) |>
pivot_longer(cols = c(
"prop_white",
"prop_black",
"prop_hispanic"),
names_to = "ethnicity",
values_to = "proportion"
) |>
ggplot() +
geom_smooth(mapping = aes(x = proportion,
y = diff_current,
color = ethnicity),
alpha = 0.2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
labs(title = "Vote differentials by ethnic proportions \nwith confidence intervals",
x = "Proportion of ethnicity",
y = "Differential in number of votes",
color = "Ethnicity")train |>
summarize(proportion_white_over_0.75 = mean(prop_white > 0.75), # 0.77 of counties have a prop_white of over 0.75
proportion_white_over_0.8 = mean(prop_white > 0.8), # 0.706 of counties have a prop_white of over 0.8
proportion_white_over_0.85 = mean(prop_white > 0.85)) # 0.631 of counties have a prop_white of over 0.85# A tibble: 1 × 3
proportion_white_over_0.75 proportion_white_over_0.8 proportion_white_over_0…¹
<dbl> <dbl> <dbl>
1 0.769 0.706 0.629
# ℹ abbreviated name: ¹proportion_white_over_0.85
The graphs show an almost quadratic relationship between the three listed ethnicities and vote differentials. While the relationship between the proportion of black residents and vote differentials remains negative throughout the distribution, vote differentials dip into the positives with high hispanic and white proportional values. Considering the confidence intervals, a high proportion of white residents is likely to vote Republican and, given the fact that most counties are predominantly white – with 77% of which having a white-proportion over three-quarters – the relationship owes to the idea that most counties overwhelmingly vote for the Republican Party. Regardless of county trends, the graphs show a clear relationship between ethnic proportions and voting differentials.
medianage_plot1 <- train |>
select(diff_current, median_age) |>
ggplot() +
geom_point(mapping = aes(x = median_age, y = diff_current),
color = "indianred3",
alpha = 0.3) +
geom_smooth(mapping = aes(x = median_age, y = diff_current),
color = "lightsteelblue4") +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
geom_vline(xintercept = 41.5, linetype = "dashed", color = "black") +
labs(title = "Vote differentials by median age",
x = "Median age",
y = "Differential in number of votes")
medianage_plot2 <- train |>
select(diff_current, median_age) |>
filter(diff_current > -50594 & diff_current < 7081) |>
ggplot() +
geom_point(mapping = aes(x = median_age, y = diff_current),
color = "indianred3",
alpha = 0.3) +
geom_smooth(mapping = aes(x = median_age, y = diff_current),
color = "lightsteelblue4") +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
geom_vline(xintercept = 41.5, linetype = "dashed", color = "black") +
labs(title = "",
subtitle = "Excluding dem extreme values",
x = "Median age",
y = "")
medianage_plot1 + medianage_plot2Counties with a younger median age are more likely to have a differential in favor of the Democratic party, a trend in line with contemporary wisdom that younger Americans are usually more receptive to the Democratic party in elections. Counties with a median age equal to traditionally-defined middle-aged years tend to vote for the Republican party. Interestingly, as the median age enters elderly ages, the vote differential tends closer towards zero and even towards the Democratic party, perhaps indicating that the relationship between median age and vote differentials is according to a cubed polynomial: younger counties tend to be Democratic, middle aged counties Republican, and elderly counties still Republican although to a lesser degree.
foreignborn_plot1 <- train |>
select(diff_current, prop_foreign_born_citizen, prop_undocumented) |>
mutate(prop_foreign_born = prop_foreign_born_citizen + prop_undocumented,
prop_undocumented_to_foreign = prop_undocumented / prop_foreign_born) |>
ggplot() +
geom_point(mapping = aes(x = prop_foreign_born, y = diff_current, color = prop_undocumented_to_foreign),
alpha = 0.6) +
scale_color_continuous(
low = "darkolivegreen1",
high = "black"
) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
geom_vline(xintercept = 0.0753, linetype = "dashed", color = "black") +
labs(title = "Vote Differentials by Proportion of Foreign-born \nand Undocumented Residents",
x = "Proportion Foreign Born",
y = "Differential in Number of Votes",
color = "") +
guides(color = "none")
foreignborn_plot2 <- train |>
select(diff_current, prop_foreign_born_citizen, prop_undocumented) |>
mutate(prop_foreign_born = prop_foreign_born_citizen + prop_undocumented,
prop_undocumented_to_foreign = prop_undocumented / prop_foreign_born) |>
filter(diff_current > -1000000) |>
ggplot() +
geom_point(mapping = aes(x = prop_foreign_born, y = diff_current, color = prop_undocumented_to_foreign),
alpha = 0.6) +
scale_color_continuous(
low = "darkolivegreen1",
high = "black"
) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
geom_vline(xintercept = 0.0753, linetype = "dashed", color = "black") +
labs(title = "",
subtitle = "Excluding dem extreme values",
x = "",
y = "",
color = "Proportion of Undocumented \nStatus among Foreign \nBorn Individuals")
foreignborn_plot1 + foreignborn_plot2Counties with a higher proportion of immigrants are more likely to have a differential in favor of the Democratic party, demonstrating a clear relationship between the proportion of immigrants and vote outcomes. Interestingly, as the proportion of undocumented increases, the vote differential tends towards zero, perhaps as the result in the increase in the proportion of non-voting residents. As noted, counties with a higher proportion of immigrants are more likely to vote in favor of the Democratic party – however, if in those same counties a high proportion of those immigrants are undocumented, then a greater portion of Democratic-leaning residents are unable to actually legally vote, thus offsetting any potential Democratic gains in votes and skewing the differential towards zero. This presents an interesting and perhaps inverse relationship between counties and their immigrant and citizenship makeups.
V-fold cross-validation
set.seed(12071999)
train_folds <- vfold_cv(data = train, v = 10)Testing models using 2020 data
Create a recipe
recipe <-
recipe(diff_current ~ diff_previous + prop_less_than_hs + prop_bachelors_higher + prop_18_to_24 + prop_65_years_older + prop_white + prop_black + prop_hispanic + poverty_rate + unemployment_rate + male_ratio_per_100_females + median_age + median_income + gini + median_housing_costs + prop_foreign_born_citizen + prop_undocumented + population_density + total_population + swing,
data = train) |>
step_impute_knn(all_predictors()) |>
step_corr(all_predictors()) |>
step_log(median_income, median_housing_costs) |>
step_interact(terms = ~ swing*diff_previous) |>
step_normalize(all_predictors())Why did we choose these steps for our recipe?
We chose to imputate using knn to ensure that any missing predictors, such as the missing Connecticut 2020 elections results for the 2024 data, have an estimation. We chose knn instead of mean imputation because we felt this would provide an estimate that was based on more evidence (or observations), hopefully improving the accuracy of our missing data.
We also chose to include the step that removes highly correlated predictors. We wanted to ensure that we were not including predictors that were too related and hurt our model. Including this step was helpful, since it removed the median age as a predictor.
We logged two predictors, median income and median housing costs. This is because median income and housing costs have a wide range of values. By logging the variables, we improve the symmetry of our data. This prevents extreme observations in the form of outliers from skewing and potentially biasing the predictions.
We decided to include a step to interact the swing county dummy predictor with the previous presidential differential result predictor. We chose to do this to ensure that our model did not necessarily estimate a republican winner because the previous differential shows that a republican won, or vice versa for a democratic candidate win. We also thought it would be important to note how being a swing county might impact the magnitude of the differential. Thus, we included an interaction between the swing dummy predictor and the previous presidential election differential results.
We normalized all predictors because of inherent differences across our types of predictors. This is in line with the requirement of LASSO models that all predictors should be centered and scaled in order to standardize coefficient units.
We included total population as a predictor in our recipe in place of using weights.
LASSO
set.seed(12071999)
lasso_spec <- linear_reg(penalty = tune(),
mixture = 1) |>
set_mode(mode = "regression") |>
set_engine(engine = "glmnet")
lasso_wf <- workflow() |>
add_recipe(recipe) |>
add_model(lasso_spec)
lasso_grid <- grid_regular(
penalty(range = c(0, 10)),
levels = 10)
lasso_resamples <- lasso_wf |>
tune_grid(resamples = train_folds,
grid = lasso_grid)
lasso_final_wf <- lasso_wf |>
finalize_workflow(select_best(lasso_resamples))
lasso_final_fit <-
lasso_final_wf |>
last_fit(modeling_sample) |>
collect_metrics() |>
print()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 6369. Preprocessor1_Model1
2 rsq standard 0.982 Preprocessor1_Model1
The LASSO model out-of-sample RMSE is 6,369.
Regression Tree
set.seed(12071999)
tree_spec <-
decision_tree(cost_complexity = tune()) |>
set_engine(engine = "rpart") |>
set_mode(mode = "regression")
tree_wf <-
workflow() |>
add_recipe(recipe) |>
add_model(tree_spec)
tree_grid <- grid_regular(
cost_complexity(range = c(0, 0.1)),
levels = 10)
tree_resamples <- tree_wf |>
tune_grid(resamples = train_folds,
grid = tree_grid)
tree_final_wf <- tree_wf |>
finalize_workflow(select_best(tree_resamples))
tree_final_fit <-
tree_final_wf |>
last_fit(modeling_sample) |>
collect_metrics() |>
print()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 41626. Preprocessor1_Model1
2 rsq standard NA Preprocessor1_Model1
The Regression Tree model out-of-sample RMSE is 41,626.
KNN
set.seed(12071999)
knn_spec <- nearest_neighbor(neighbors = tune()) |>
set_engine(engine = "kknn") |>
set_mode(mode = "regression")
knn_wf <- workflow() |>
add_recipe(recipe) |>
add_model(knn_spec)
knn_grid <- grid_regular(
neighbors(range = c(1,99)),
levels = 10)
knn_resamples <- knn_wf |>
tune_grid(resamples = train_folds,
grid = knn_grid,
metrics = metric_set(rmse))
knn_final_wf <- knn_wf |>
finalize_workflow(select_best(knn_resamples))
knn_final_fit <-
knn_final_wf |>
last_fit(modeling_sample) |>
collect_metrics() |>
print()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 15241. Preprocessor1_Model1
2 rsq standard 0.911 Preprocessor1_Model1
The KNN model out-of-sample RMSE is 15,241.
The LASSO model has the lowest out-of-sample RMSE, so we will use it for our model implementation using the 2024 data.
Final LASSO model estimation on 2024 data
set.seed(12071999)
final_model <- lasso_final_wf |>
fit(data = finaldata2024)
predictions2024 <- bind_cols(
finaldata2024,
predict(object = final_model, new_data = finaldata2024)
)
print(predictions2024) |>
select(diff_current, .pred)# A tibble: 3,110 × 38
state_name county_name county_fips total_votes_current votes_gop_current
<chr> <chr> <chr> <dbl> <dbl>
1 Alabama Autauga County 1001 28139 20447
2 Alabama Baldwin County 1003 120973 95144
3 Alabama Barbour County 1005 9766 5578
4 Alabama Bibb County 1007 9230 7563
5 Alabama Blount County 1009 28024 25271
6 Alabama Bullock County 1011 4104 1099
7 Alabama Butler County 1013 8459 5167
8 Alabama Calhoun County 1015 48435 34841
9 Alabama Chambers County 1017 14215 8704
10 Alabama Cherokee County 1019 12965 11342
# ℹ 3,100 more rows
# ℹ 33 more variables: votes_dem_current <dbl>, diff_current <dbl>,
# per_gop_current <dbl>, per_dem_current <dbl>, per_point_diff_current <dbl>,
# total_votes_previous <dbl>, votes_gop_previous <dbl>,
# votes_dem_previous <dbl>, diff_previous <dbl>,
# per_point_diff_previous <dbl>, land_area <dbl>, total_population <dbl>,
# population_density <dbl>, prop_less_than_hs <dbl>, prop_hs_grad <dbl>, …
# A tibble: 3,110 × 2
diff_current .pred
<dbl> <dbl>
1 13018 11024.
2 70381 44171.
3 1458 2844.
4 5946 6174.
5 22702 17882.
6 -1884 631.
7 1919 3318.
8 21671 16441.
9 3302 3975.
10 9792 8587.
# ℹ 3,100 more rows
lasso_final_wf |>
last_fit(modeling_sample) |>
collect_metrics() |>
print()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 6369. Preprocessor1_Model1
2 rsq standard 0.982 Preprocessor1_Model1
finaldata2024 |>
mutate(actual_winner = ifelse(diff_current < 0, "Dem", "Rep")) |>
group_by(actual_winner) |>
summarise(mean(diff_current))# A tibble: 2 × 2
actual_winner `mean(diff_current)`
<chr> <dbl>
1 Dem -41581.
2 Rep 8004.
Our final LASSO model RMSE is 6,369. This is a fairly high RMSE, considering that the voting margin is, on average, about 8,000 votes in Republican-won counties and 41,580 votes in Democrat-won counties. This model may not be the best for predicting precise margins on the county level. Does the model perform better as a predictor of binary outcomes?
Calculating binary outcome variable (winning party)
Our model in unique in that, in addition to predicting voting margins, it can predict binary county outcomes (Republican or Democrat winner) based on whether the predicted margin is positive or negative. We write code for this binary prediction to further evaluate our model using a confusion matrix and other measures of prediction accuracy.
predictions2024 <- predictions2024 |>
mutate(actual_winner = ifelse(diff_current < 0, "Dem", "Rep"),
rep_actual_winner = as_factor(ifelse(diff_current > 0, 1, 0)),
predicted_winner = ifelse(.pred < 0, "Dem", "Rep"),
rep_pred_winner = as_factor(ifelse(.pred > 0, 1, 0)))
conf_mat(data = predictions2024,
truth = rep_actual_winner,
estimate = rep_pred_winner) Truth
Prediction 0 1
0 316 23
1 137 2634
accuracy(data = predictions2024,
truth = rep_actual_winner,
estimate = rep_pred_winner,
event_level = "second")# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.949
precision(data = predictions2024,
truth = rep_actual_winner,
estimate = rep_pred_winner,
event_level = "second")# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.951
recall(data = predictions2024,
truth = rep_actual_winner,
estimate = rep_pred_winner,
event_level = "second")# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 recall binary 0.991
specificity(data = predictions2024,
truth = rep_actual_winner,
estimate = rep_pred_winner,
event_level = "second")# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 specificity binary 0.698
Our model has a 95% accuracy rate, which means that it correctly predicts the party of the U.S. presidential candidate that won the county in 2024 95% of the time. The model’s precision rate is 95%, which means that when predicting a positive result (i.e., a Republican win), the model is right 95% of the time. The model’s recall rate is 99%, which means that it accurately predicts 99% of Republican-won counties
However, these high precision, accuracy, and recall scores may be misleading, as the vast majority of counties in 2024 (2657 / 3110 observations, or 85%) voted Republican. Our model could have high accuracy, precision, and recall rates simply by predicting Republican every time, but it would not be a good model.
The model’s specificity, or the rate at which the model correctly predicts true negative (Democrat-won) observations, is thus a much better indicator of model performance. Our specificity rate is 70%, which is better than random guessing, but reveals the weakness of this model. Overall, the model still does a pretty good job at predicting the winning presidential party on the county-level.
Comparing predicted vs actual electoral college results
# by electoral college
electoral_college <- read_csv("data/Electoral_College.csv") |>
select(!Abb_State) |>
rename(state_name = Full_State)
electoral_votes_state <- predictions2024 |>
group_by(state_name) |>
summarize(sum_diff_current = sum(diff_current),
sum_diff_prediction = sum(.pred)) |>
mutate(actual_winner = ifelse(sum_diff_current < 0, "Dem", "Rep"),
pred_winner = ifelse(sum_diff_prediction < 0, "Dem", "Rep")) |>
left_join(electoral_college, by = "state_name")
electoral_votes_state |>
group_by(actual_winner) |>
summarize(sum_electoral_votes = sum(Electoral_College_Votes)) # actual winner was republican with 308 electoral votes# A tibble: 2 × 2
actual_winner sum_electoral_votes
<chr> <dbl>
1 Dem 227
2 Rep 308
electoral_votes_state |>
group_by(pred_winner) |>
summarize(sum_electoral_votes = sum(Electoral_College_Votes)) # predicted winner was republican with 323 electoral votes# A tibble: 2 × 2
pred_winner sum_electoral_votes
<chr> <dbl>
1 Dem 212
2 Rep 323
# maps
actual_2024_map <- state_geospatial |>
left_join(electoral_votes_state, by = "state_name") |>
filter(!is.na(actual_winner)) |>
select(actual_winner, geometry) |>
ggplot() +
geom_sf(mapping = aes(fill = actual_winner)) +
scale_fill_manual(values = c(
"Dem" = "royalblue4",
"Rep" = "firebrick3")
) +
labs(title = "Electoral map (2024)",
fill = "Winner")
predicted_2024_map <- state_geospatial |>
left_join(electoral_votes_state, by = "state_name") |>
filter(!is.na(pred_winner)) |>
select(pred_winner, geometry) |>
ggplot() +
geom_sf(mapping = aes(fill = pred_winner)) +
scale_fill_manual(values = c(
"Dem" = "royalblue4",
"Rep" = "firebrick3")
) +
labs(title = "Predicted electoral map (2024)",
fill = "")
actual_2024_map + predicted_2024_mapOur model is not precise enough to predict voting margins well, and it skews Republican (Republican wins are overpredicted and Democratic wins are underpredicted). In part due to its Republican-skew, the model performs fairly well in predicting the outcome of the 2024 presidential election. Although the RMSE of the final model may be of concern, the aggregate state outcomes convey an accurate prediction of the 2024 presidential election, correctly anticipating a Donald Trump victory (though by a wider margin than he actually won). Our model suggests that, despite the assumingly unprecedented likelihood of Trump’s re-election backed by popular media, his victory could have been predicted utilizing county demographic, economic, and political factors that we included in our LASSO model.
Additional weaknesses: inability to logarithmize margins and missing data
The primary outcome, diff_current, describes the differential in the number of votes between the Republican and Democratic candidates. Since counties vary greatly in population, the RMSE of diff_current is larger than the population of some counties, let alone the total number of votes. Utilizing a logarithmic version of our primary outcome variable would have been best practice in order to control for this drawback while also maintaining the ability to aggregate the differential at the state level. However, the inclusion of a step_log with respect to diff_current function in our recipe resulted in errors, forcing us to forego the logarithmic transformation of our outcome.
There are two instances of significant missing data in particular. Both models exclude the State of Alaska and every one of the state’s boroughs, census areas, or county equivalents, due to a mismatch in unit of observations reported in the predictor and vote data. Since no clear connection between the Alaskan observations could be made for the entire state, all observations from Alaska were dropped. Additionally, the State of Connecticut reconstructed their census-designated statistical areas and, starting in 2022, the Census Bureau began to produce estimates for these newly-designated areas instead of Connecticut’s counties. Although the ACS 1-year estimates and the voting data has information for Connecticut’s census areas in 2022 and 2024, respectively, CT’s census areas do not have previous years voting data due to their recent creation, resulting in missing data for the diff_previous predictor. Instead of dropping Connecticut data altogether, though, we decided to utilize a step_impute_knn with respect to missing data function in order to conserve CT’s remaining predictors.
What would we do differently next time?
We encountered issues with the fact that the number of county votes in total varies. That is, not all counties contribute in the same magnitude to the presidential results on a national scale. Some counties contribute approximately 10,000 votes to the presidential election. Others may contribute 100,000 votes to the presidential election. In an attempt to solve this problem, we chose to predict differentials at the county level. The consequences of this choice was that our model RMSE does not provide a value that is easy to interpret. The number it provides is larger than the number of votes a county may have in total. Thus, the RMSE is difficult to interpret. Next time, we would like to figure out a way to solve this problem. We attempted to log our outcome variable in the step function in our recipe, but the model did not run when we tried this. Estimating the actual magnitude of the differentials in a presidential election was the most difficult part of this process. In a future attempt, we encourage the consideration of these issues and an appropriate remedy to more accurately predict the voting margins at the county level.
Happy Holidays!